Differential expression using the bulk RNAseq JAX_RNAseq_Neuroectoderm data from the June 2024 release.
There is only one model system in this dataset.
There are samples from multiple days.
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)
library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())
#path = "../../data/MorPhiC/June-2024/JAX_RNAseq_Neuroectoderm/"
path = "../../MorPhiC/data/June-2024/JAX_RNAseq_Neuroectoderm/"
dat_path = paste0(path, "processed/release110-gencode44/Tables")meta = fread("data/June_2024/JAX_RNAseq_Neuroectoderm_meta_data.tsv",
data.table = FALSE)
dim(meta)## [1] 174 32
meta[1:2,]## biomaterial_id lib_biomaterial_id differentiated_biomaterial_id
## 1 MOK20002-WT GT23-05635 CBO-MOK20002-WT-Day7
## 2 MOK20002-WT GT23-05625 CBO-MOK20002-WT-Day6
## differentiated_biomaterial_description differentiation_protocol
## 1 KOLF2.2 derived cortical brain organoid JAXPD003
## 2 KOLF2.2 derived cortical brain organoid JAXPD003
## differentiated_timepoint_value differentiated_timepoint_unit
## 1 7 days
## 2 6 days
## differentiated_terminally_differentiated
## 1 No
## 2 No
## model_organ ko_strategy ko_gene
## 1 Cortical brain (dorsal forebrain pattering) WT WT
## 2 Cortical brain (dorsal forebrain pattering) WT WT
## library_preparation_protocol dissociation_protocol fragment_size input_amount
## 1 JAXPL001 N.A 410 300
## 2 JAXPL001 N.A 425 300
## input_unit final_yield final_yield_unit lib_concentration
## 1 ngs 249.6 ngs 51.8
## 2 ngs 170.0 ngs 33.8
## lib_concentration_unit PCR_cycle PCR_cycle_sample_index
## 1 nM 10 NA
## 2 nM 10 NA
## file_id sequencing_protocol
## 1 RFX_WT_Day7_GT23-05635_GCACGGAC-TGCGAGAC_S55_L002 JAXPS001
## 2 RFX_WT_Day6_GT23-05625_GACCTGAA-CTCACCAA_S97_L002 JAXPS001
## run_id biomaterial_description
## 1 20230424_23-scbct-028-run3 KOLF2.2
## 2 20230424_23-scbct-028-run3 KOLF2.2
## derived_cell_line_accession clone_id protocol_id zygosity type model_system
## 1 KOLF2.2J WT N.A N.A iPSC CBO
## 2 KOLF2.2J WT N.A N.A iPSC CBO
names(meta)## [1] "biomaterial_id"
## [2] "lib_biomaterial_id"
## [3] "differentiated_biomaterial_id"
## [4] "differentiated_biomaterial_description"
## [5] "differentiation_protocol"
## [6] "differentiated_timepoint_value"
## [7] "differentiated_timepoint_unit"
## [8] "differentiated_terminally_differentiated"
## [9] "model_organ"
## [10] "ko_strategy"
## [11] "ko_gene"
## [12] "library_preparation_protocol"
## [13] "dissociation_protocol"
## [14] "fragment_size"
## [15] "input_amount"
## [16] "input_unit"
## [17] "final_yield"
## [18] "final_yield_unit"
## [19] "lib_concentration"
## [20] "lib_concentration_unit"
## [21] "PCR_cycle"
## [22] "PCR_cycle_sample_index"
## [23] "file_id"
## [24] "sequencing_protocol"
## [25] "run_id"
## [26] "biomaterial_description"
## [27] "derived_cell_line_accession"
## [28] "clone_id"
## [29] "protocol_id"
## [30] "zygosity"
## [31] "type"
## [32] "model_system"
table(table(meta$biomaterial_id))##
## 1 2 3 4
## 36 5 16 20
table(table(meta$lib_biomaterial_id))##
## 1
## 174
table(table(meta$differentiated_biomaterial_id))##
## 1
## 174
For the row with column differentiated_biomaterial_id==“CBO-MOK20004-WT-Day3”, it should have value 3 instead of the original 4 in the column differentiated_timepoint_value.
print("Before correcting the error in timepoint for CBO-MOK20004-WT-Day3:")## [1] "Before correcting the error in timepoint for CBO-MOK20004-WT-Day3:"
print("Table between ko_gene and differentiated_timepoint_value:")## [1] "Table between ko_gene and differentiated_timepoint_value:"
table(meta$ko_gene, meta$differentiated_timepoint_value)##
## 3 4 5 6 7 8 9 14
## FEZF2 0 0 9 9 9 9 0 0
## LHX5 9 9 9 9 0 0 0 0
## LHX9 0 0 0 0 0 0 0 9
## OTX1 0 0 0 0 9 9 9 0
## PAX6 0 0 0 0 9 0 0 0
## RFX4 0 0 0 9 9 9 9 0
## WT 0 2 2 5 4 5 2 1
meta[which(meta$differentiated_biomaterial_id=="CBO-MOK20004-WT-Day3"),
"differentiated_timepoint_value"] = 3
print("After correcting the error in timepoint for CBO-MOK20004-WT-Day3:")## [1] "After correcting the error in timepoint for CBO-MOK20004-WT-Day3:"
print("Table between ko_gene and differentiated_timepoint_value:")## [1] "Table between ko_gene and differentiated_timepoint_value:"
table(meta$ko_gene, meta$differentiated_timepoint_value)##
## 3 4 5 6 7 8 9 14
## FEZF2 0 0 9 9 9 9 0 0
## LHX5 9 9 9 9 0 0 0 0
## LHX9 0 0 0 0 0 0 0 9
## OTX1 0 0 0 0 9 9 9 0
## PAX6 0 0 0 0 9 0 0 0
## RFX4 0 0 0 9 9 9 9 0
## WT 1 1 2 5 4 5 2 1
According to information from JAX, part of the samples from day 5 in fact should have a timepoint that is closer to day 6. This is the group of samples under day 5 and from run_id
20230522_23-scbct-052
For these samples, label them as day 6.
Since the other samples from day 5 only have one WT, in later DE analysis, ignore these samples and do not run DE analysis for day 5.
table(meta$run_id, meta$differentiated_timepoint_value)##
## 3 4 5 6 7 8 9 14
## 20230228_23-scbct-008 0 0 0 0 10 0 0 0
## 20230424_23-scbct-028-run3 0 0 0 10 10 10 0 0
## 20230508_23-scbct-039-run2 0 0 10 10 10 0 0 0
## 20230522_23-scbct-052 10 10 10 0 0 0 0 0
## 20230824_23-scbct-092 0 0 0 0 0 0 0 10
## 20240131_23-robson-020 0 0 0 0 10 10 10 0
## 20240307_24-robson-003 0 0 0 0 0 0 10 0
## 20240307_24-robson-004 0 0 0 12 0 0 0 0
## 20240307_24-robson-006 0 0 0 0 0 12 0 0
index_day5_to_change = which((meta$run_id=="20230522_23-scbct-052") &
(meta$differentiated_timepoint_value==5))
length(index_day5_to_change)## [1] 10
meta[index_day5_to_change, c("run_id", "differentiated_timepoint_value")]## run_id differentiated_timepoint_value
## 73 20230522_23-scbct-052 5
## 80 20230522_23-scbct-052 5
## 83 20230522_23-scbct-052 5
## 87 20230522_23-scbct-052 5
## 89 20230522_23-scbct-052 5
## 94 20230522_23-scbct-052 5
## 98 20230522_23-scbct-052 5
## 103 20230522_23-scbct-052 5
## 108 20230522_23-scbct-052 5
## 109 20230522_23-scbct-052 5
meta$differentiated_timepoint_value[index_day5_to_change] = 6
table(meta$run_id, meta$differentiated_timepoint_value)##
## 3 4 5 6 7 8 9 14
## 20230228_23-scbct-008 0 0 0 0 10 0 0 0
## 20230424_23-scbct-028-run3 0 0 0 10 10 10 0 0
## 20230508_23-scbct-039-run2 0 0 10 10 10 0 0 0
## 20230522_23-scbct-052 10 10 0 10 0 0 0 0
## 20230824_23-scbct-092 0 0 0 0 0 0 0 10
## 20240131_23-robson-020 0 0 0 0 10 10 10 0
## 20240307_24-robson-003 0 0 0 0 0 0 10 0
## 20240307_24-robson-004 0 0 0 12 0 0 0 0
## 20240307_24-robson-006 0 0 0 0 0 12 0 0
print("After correcting timepoint for some samples originally labeled as day 5:")## [1] "After correcting timepoint for some samples originally labeled as day 5:"
print("Table between ko_gene and differentiated_timepoint_value:")## [1] "Table between ko_gene and differentiated_timepoint_value:"
table(meta$ko_gene, meta$differentiated_timepoint_value)##
## 3 4 5 6 7 8 9 14
## FEZF2 0 0 9 9 9 9 0 0
## LHX5 9 9 0 18 0 0 0 0
## LHX9 0 0 0 0 0 0 0 9
## OTX1 0 0 0 0 9 9 9 0
## PAX6 0 0 0 0 9 0 0 0
## RFX4 0 0 0 9 9 9 9 0
## WT 1 1 1 6 4 5 2 1
cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)## [1] 38592 175
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]## Name RFX_CE_F5_Day8_GT23-05641_GTCGGAGC-TTATAACC_S73_L002
## 1 ENSG00000268674 0
## 2 ENSG00000271254 1521
## LHX5_CE_A1_Day3_GT23-07300_AAGTCCAA-TACTCATA_S163_L004
## 1 0
## 2 1270
## LHX5_KO_E4_Day5_GT23-07318_AACGTTCC-AGTACTCC_S166_L004
## 1 0
## 2 746
stopifnot(setequal(names(cts)[-1], meta$file_id))
meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)##
## TRUE
## 174
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name
unique_timepoints = unique(meta$differentiated_timepoint_value)
unique_timepoints = sort(unique_timepoints)
xx = as.character(meta$differentiated_timepoint_value)
xx = factor(xx, levels = as.character(unique_timepoints))
meta$timepoint = xxgene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)## [1] 62754 8
gene_anno[1:2,]## geneId chr strand start end ensembl_gene_id
## <char> <char> <char> <int> <int> <char>
## 1: ENSG00000000003.16 chrX - 100627108 100639991 ENSG00000000003
## 2: ENSG00000000005.6 chrX + 100584936 100599885 ENSG00000000005
## hgnc_symbol description
## <char> <char>
## 1: TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2: TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(rownames(cts_dat) %in% gene_anno$ensembl_gene_id)##
## TRUE
## 38592
# all ensembl gene IDs are contained in the annotation
setdiff(rownames(cts_dat), gene_anno$ensembl_gene_id)## character(0)
model_s = meta$model_system
table(model_s, useNA="ifany")## model_s
## CBO
## 174
get_q75 <- function(x){quantile(x, probs = 0.75)}
gene_info = data.frame(Name = cts$Name,
apply(cts_dat, 1, tapply, model_s, min),
apply(cts_dat, 1, tapply, model_s, median),
apply(cts_dat, 1, tapply, model_s, get_q75),
apply(cts_dat, 1, tapply, model_s, max))
dim(gene_info)## [1] 38592 5
gene_info[1:2, ]## Name apply.cts_dat..1..tapply..model_s..min.
## ENSG00000268674 ENSG00000268674 0
## ENSG00000271254 ENSG00000271254 426
## apply.cts_dat..1..tapply..model_s..median.
## ENSG00000268674 0
## ENSG00000271254 1318
## apply.cts_dat..1..tapply..model_s..get_q75.
## ENSG00000268674 0.00
## ENSG00000271254 1701.75
## apply.cts_dat..1..tapply..model_s..max.
## ENSG00000268674 3
## ENSG00000271254 3117
table(row.names(gene_info)==gene_info$Name, useNA="ifany")##
## TRUE
## 38592
names(gene_info)[2:5] = paste0(rep(c("CBO_"), times=4),
rep(c("min", "median", "q75", "max"), each=1))
dim(gene_info)## [1] 38592 5
gene_info[1:2,]## Name CBO_min CBO_median CBO_q75 CBO_max
## ENSG00000268674 ENSG00000268674 0 0 0.00 3
## ENSG00000271254 ENSG00000271254 426 1318 1701.75 3117
summary(gene_info[,-1])## CBO_min CBO_median CBO_q75 CBO_max
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 2
## Median : 0.0 Median : 3.0 Median : 5.8 Median : 22
## Mean : 213.8 Mean : 699.1 Mean : 931.4 Mean : 1829
## 3rd Qu.: 89.0 3rd Qu.: 332.1 3rd Qu.: 474.6 3rd Qu.: 1003
## Max. :88969.0 Max. :489181.5 Max. :586205.0 Max. :1301990
table(gene_info$CBO_q75 > 0)##
## FALSE TRUE
## 14390 24202
w2kp = which(gene_info$CBO_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)## [1] 24202 174
gene_info = gene_info[w2kp,]
meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)
ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "KO type") +
scale_color_brewer(palette = "Set1")meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "Run ID") +
scale_color_brewer(palette = "Set1")sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
summary(meta_m$q75)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 424.0 654.0 876.0 880.7 1047.0 1276.0
dim(cts_dat_m)## [1] 24202 21
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)## [1] 21 23745
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)## [1] "sdev" "rotation" "center" "scale" "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)## [1] 0.27258414 0.16356916 0.10631824 0.07678308 0.05052017
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs## [1] 27.3 16.4 10.6 7.7 5.1
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
# there is only one level for model_system here
# no need to use model_system for the shape
gPC = ggplot(PC_df, aes(x = PC1, y = PC2,
color=run_id_short)) +
geom_point(size=2.5, alpha=0.7) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme_classic() +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)table(PC_df$run_id_short, PC_df$timepoint)##
## 3 4 5 6 7 8 9 14
## robson-003 0 0 0 0 0 0 1 0
## robson-004 0 0 0 3 0 0 0 0
## robson-006 0 0 0 0 0 3 0 0
## robson-020 0 0 0 0 1 1 1 0
## scbct-008 0 0 0 0 1 0 0 0
## scbct-028-run3 0 0 0 1 1 1 0 0
## scbct-039-run2 0 0 1 1 1 0 0 0
## scbct-052 1 1 0 1 0 0 0 0
## scbct-092 0 0 0 0 0 0 0 1
n_shapes = length(unique(PC_df$run_id_short))
u_shapes = c(1, 7, 8, 9, 10, 11, 15, 16, 17)[1:n_shapes]
n_time_point = length(unique(PC_df$timepoint))
gPC_time = ggplot(PC_df, aes(x = PC1, y = PC2,
color=timepoint,
shape=run_id_short)) +
geom_point(size=2.5) +
scale_shape_manual(values = u_shapes) +
scale_color_brewer(palette = "Dark2") +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme_classic() +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC_time)For the June 2024 release of JAX_RNAseq_Neuroectoderm, there is only one model system.
table(meta$run_id, meta$model_system, useNA="ifany")##
## CBO
## 20230228_23-scbct-008 10
## 20230424_23-scbct-028-run3 30
## 20230508_23-scbct-039-run2 30
## 20230522_23-scbct-052 30
## 20230824_23-scbct-092 10
## 20240131_23-robson-020 30
## 20240307_24-robson-003 10
## 20240307_24-robson-004 12
## 20240307_24-robson-006 12
table(meta$run_id, meta$ko_gene, useNA="ifany")##
## FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
## 20230228_23-scbct-008 0 0 0 0 9 0 1
## 20230424_23-scbct-028-run3 0 0 0 0 0 27 3
## 20230508_23-scbct-039-run2 27 0 0 0 0 0 3
## 20230522_23-scbct-052 0 27 0 0 0 0 3
## 20230824_23-scbct-092 0 0 9 0 0 0 1
## 20240131_23-robson-020 0 0 0 27 0 0 3
## 20240307_24-robson-003 0 0 0 0 0 9 1
## 20240307_24-robson-004 0 9 0 0 0 0 3
## 20240307_24-robson-006 9 0 0 0 0 0 3
table(meta$ko_strategy, meta$ko_gene, useNA="ifany")##
## FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
## CE 12 12 3 9 3 12 0
## KO 12 12 3 9 3 12 0
## PTC 12 12 3 9 3 12 0
## WT 0 0 0 0 0 0 21
df1 = as.data.frame(table(meta$timepoint, meta$ko_gene))
names(df1)[1:2] = c("time", "gene")
df1$Freq[df1$gene != "WT"] = df1$Freq[df1$gene != "WT"]/3
ggplot(df1, aes(x = time, y = gene, fill = Freq)) +
geom_tile(color = "black", linewidth = 0.3) +
scale_fill_gradient(low = "white", high = "red") +
theme_minimal() +
geom_text(aes(label = ifelse(gene == "WT", Freq, "")),
color = "black", size = 3) +
labs(x = "Time", y = "Gene")Explore the PC plots first, to decide running the models on what level of DE group.
Then run the DE analysis.
for(model1 in unique(meta$model_system)){
print(model1)
sample2kp = which(meta$model_system == model1)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
table(meta_m$ko_strategy)
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
# verify the consistency between ko_strategy from metadata column and
# that from filenames
extracted_ko_strings = str_extract_all(meta_m$file_id, "CE|KO|PTC|WT")
print(table(sapply(extracted_ko_strings, length)))
print(table(meta_m$ko_strategy, unlist(extracted_ko_strings)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(unique(model_s)) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme_classic() +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
# too many levels for shape, need to manually specify it
print(length(unique(PC_df$ko_gene)))
print(length(unique(PC_df$run_id_short)))
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) +
scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(1, 7, 8, 9, 10, 11, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(unique(model_s)) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme_classic() +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
print(table(meta_m$run_id_short, meta_m$ko_gene))
colnames(PC_df)[which(colnames(PC_df)=="timepoint")] = "day"
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_gene, color=day)) +
geom_point(size=2.5, alpha=0.7) +
scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(1, 7, 8, 9, 10, 11, 15, 16, 17)[1:length(unique(PC_df$ko_gene))]) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(unique(model_s)) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme_classic() +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_gene, color=day,
alpha=ifelse(ko_gene == "WT", 1, 0.6))) +
geom_point(size=2.5) +
scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(1, 7, 8, 9, 10, 11, 15, 16, 17)[1:length(unique(PC_df$ko_gene))]) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(unique(model_s)) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2),
alpha = "none"
) +
theme_classic() +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
}## [1] "CBO"
##
## 1
## 174
##
## CE KO PTC WT
## CE 51 0 0 0
## KO 0 51 0 0
## PTC 0 0 51 0
## WT 0 0 0 21
## [1] 7
## [1] 9
##
## FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
## robson-003 0 0 0 0 0 9 1
## robson-004 0 9 0 0 0 0 3
## robson-006 9 0 0 0 0 0 3
## robson-020 0 0 0 27 0 0 3
## scbct-008 0 0 0 0 9 0 1
## scbct-028-run3 0 0 0 0 0 27 3
## scbct-039-run2 27 0 0 0 0 0 3
## scbct-052 0 27 0 0 0 0 3
## scbct-092 0 0 9 0 0 0 1
Use day to construct DE groups.
Run DESeq2 for different day groups separately
day 3 + day 4, with a day indictor
day 5 (do not run for day 5, since after updating day label, the samples left only have one WT)
day 6
day 7
day 8 (exclude samples from run ID short scbct-028-run3)
day 9 (do not include day 14 in this setting)
For the output files, create two versions, one with direct information, one with padj set to NA for the genes when the number of 0 per test is >=4.
In more details, for both versions of the outputs, make it one file per knockout gene per DE group (DE group can be model system, model system + day + condition, model system + day + run_id, model system + day, etc.). The first version of the output files contains gene id, and for each knockout strategy:
baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
The second version of the output files contains gene id,
chr, strand, position, gene name
and for each knockout strategy:
log2FoldChange, pvalue, padj (set to be NA if the number of 0 per test >= 4),
In addition, save out a separate file of the sample size for each comparison. This needs to be by DE group.
df_size = NULL
release_name = "2024_06_JAX_RNAseq_Neuroectoderm"
output_dir = file.path("results", release_name)
raw_output_dir = file.path(output_dir, "raw")
processed_output_dir = file.path(output_dir, "processed")
if (!file.exists(raw_output_dir)){
dir.create(raw_output_dir, recursive = TRUE)
}
if (!file.exists(processed_output_dir)){
dir.create(processed_output_dir, recursive = TRUE)
}for(model1 in unique(meta$model_system)){
print(model1)
sample2kp = which(meta$model_system == model1)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
table(meta_m$ko_strategy)
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
## Generate sample information matrix
cols2kp = c("model_system", "ko_strategy", "ko_gene", "run_id", "run_id_short",
"timepoint", "q75")
sample_info = meta_m[,cols2kp]
colnames(sample_info)[which(colnames(sample_info)=="timepoint")] = "day"
dim(sample_info)
sample_info[1:2,]
table(sample_info$ko_strategy, sample_info$ko_gene)
sample_info$ko_group = paste(sample_info$ko_gene,
sample_info$ko_strategy, sep="_")
print(table(sample_info$ko_group))
print(length(unique(sample_info$ko_group)))
print(table(sample_info$run_id, sample_info$ko_group))
print(table(sample_info$ko_group, sample_info$day))
sample_info$ko_group_run_id = paste(sample_info$ko_group,
sample_info$run_id_short, sep="_")
print(table(sample_info$ko_gene, sample_info$day))
print(table(sample_info$ko_group_run_id, sample_info$day))
# print the table of ko_group and run_id by day
for (cur_day in as.character(unique_timepoints)){
sample_info_by_day = sample_info[which(sample_info$day==cur_day), ]
print("---------------------------")
print(paste0("On day ", as.character(cur_day), ": "))
print(table(sample_info_by_day$ko_group, sample_info_by_day$run_id_short))
}
#sample_info$log_q75 = log(sample_info$q75)
# run DESeq2 for different day groups separately
# day 3 + day 4
# day 5
# day 6
# day 7
# day 8 (not excluding RFX4 for now)
# day 9 (do not include day 14 in this setting)
# do not include run_id in the model if there is only one run_id in certain day
sample_info$day_group = as.character(sample_info$day)
sample_info$day_group[which(sample_info$day_group%in%c("3", "4"))] = "3_4"
table(sample_info$day_group)
sample_info$de_grp = paste0(sample_info$model_system, "_day_", sample_info$day_group)
unique_de_grps = setdiff(unique(sample_info$de_grp),
c(paste0(model1, "_day_5"), paste0(model1, "_day_14")))
sorted_de_grps = sort(unique_de_grps)
for (d_group in sorted_de_grps){
print("===================================")
print(sprintf("day group %s DESeq2 results:", d_group))
print("===================================")
dg_index = which(sample_info$de_grp==d_group)
cts_dat_m_dg = cts_dat_m[, dg_index]
sample_info_dg = sample_info[dg_index, ]
# additional step for DE group CBO_day_8, to exclude samples from run id short scbct-028-run3
if (d_group == "CBO_day_8"){
kept_index = which(sample_info_dg$run_id_short!="scbct-028-run3")
cts_dat_m_dg = cts_dat_m_dg[, kept_index]
sample_info_dg = sample_info_dg[kept_index, ]
stopifnot(all(sample_info_dg$run_id_short!="scbct-028-run3"))
}
stopifnot(all(sample_info_dg$de_grp==d_group))
# do two steps of filtering
# first, filter based on q75 of each gene
# second, filter based on whether each gene is expressed in at least 4 cells
dg_q75 = apply(cts_dat_m_dg, 1, quantile, probs=0.75)
cts_dat_m_dg = cts_dat_m_dg[dg_q75 > 0,]
print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d",
length(dg_q75), nrow(cts_dat_m_dg)))
n_pos = rowSums(cts_dat_m_dg>0)
cts_dat_m_dg = cts_dat_m_dg[n_pos >= 4,]
if (length(n_pos)==nrow(cts_dat_m_dg)){
print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
}else{
print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d",
length(n_pos), nrow(cts_dat_m_dg)))
}
# update the q75 based on only genes that will be used in the DE analysis
sample_info_dg$q75 = apply(cts_dat_m_dg, 2, quantile, probs = 0.75)
sample_info_dg$log_q75 = log(sample_info_dg$q75)
# add day ID as another covariate for day group 3_4
# but cannot do that for day group 9_14,
# since day 14 completely overlaps with run ID scbct-092
# for day 9 and day 14, only run analysis on day 9 samples
# since day 3_4 only has one run id
# write out the setting for day 3_4 specifically
if (d_group == "CBO_day_3_4"){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + day + log_q75)
}else if (length(unique(sample_info_dg$run_id))==1){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + log_q75)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + run_id + log_q75)
}
start.time <- Sys.time()
dds = DESeq(dds)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
res_rd[1:2,]
table(is.na(res_rd$padj))
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " read depth"))
print(g0)
## Rerun the analysis without read-depth if it is not significant for
## most genes, and the number of samples is small
## i.e., proportion of non-null in the distribution is smaller than 0.01
## (this needs to be restricted to the genes with not NA adjusted pvalue)
## or if smaller than 0.1 and the number of samples is not greater than 6
pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
if(pi_1_rd < 0.01 || (ncol(cts_dat_m_dg) <= 6 && pi_1_rd < 0.1 )){
print("rerun DESeq2 without read depth")
include_rd = 0
if (d_group == "CBO_day_3_4"){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + day)
}else if (length(unique(sample_info_dg$run_id))==1){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + run_id)
}
dds = DESeq(dds)
}else{
include_rd = 1
}
## association with run_id
if (length(unique(sample_info_dg$run_id))>1){
# de_group 3_4 only has one run id, this part is not relevant for it
if(include_rd)
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
else
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group)
res_lrt = results(dds_lrt)
res_run_id = as.data.frame(res_lrt)
dim(res_run_id)
res_run_id[1:2,]
table(is.na(res_run_id$padj))
g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " Run ID"))
print(g0)
}
## DE analysis for each KO gene
table(sample_info_dg$ko_gene)
table(sample_info_dg$ko_gene, sample_info_dg$ko_strategy)
genos = setdiff(unique(sample_info_dg$ko_gene), "WT")
genos
ko_grp = c("CE", "KO", "PTC")
for(geno in genos){
res_geno_df = NULL
res_geno_reduced_df = NULL
res_geno = list()
for(k_grp1 in ko_grp){
res_g1 = results(dds, contrast = c("ko_group",
paste0(geno, "_", k_grp1), "WT_WT"))
res_g1 = signif(data.frame(res_g1), 3)
# add a column to record the number of zeros per test
test_index = which(sample_info_dg$ko_group%in%c(paste0(geno, "_", k_grp1), "WT_WT"))
cts_dat_m_dg_test = cts_dat_m_dg[, test_index]
n_zero = rowSums(cts_dat_m_dg_test==0)
res_g1$n_zeros = n_zero
# record the number of samples involved in the comparison
test_ko_group_vec = sample_info_dg$ko_group[test_index]
if (d_group=="CBO_day_8"){
d_group_for_size = paste0(d_group, "_excluding_20230424_23-scbct-028-run3")
}else{
d_group_for_size = d_group
}
df_size = rbind(df_size,
c(d_group_for_size,
paste0(geno, "_", k_grp1),
sum(test_ko_group_vec!="WT_WT"),
sum(test_ko_group_vec=="WT_WT")))
# prepare a processed version of output
res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
if ((sum(test_ko_group_vec!="WT_WT")==1)|(sum(test_ko_group_vec=="WT_WT")==1)){
res_g1_reduced$padj = NA
}
res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
res_geno[[k_grp1]] = res_g1_reduced
g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " ", geno, "_", k_grp1))
print(g1)
tag1 = sprintf("%s_%s_", geno, k_grp1)
colnames(res_g1) = paste0(tag1, colnames(res_g1))
colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))
if(is.null(res_geno_df)){
res_geno_df = res_g1
}else if(!is.null(res_geno_df)){
stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
res_geno_df = cbind(res_geno_df, res_g1)
}
if(is.null(res_geno_reduced_df)){
res_geno_reduced_df = res_g1_reduced
}else if(!is.null(res_geno_reduced_df)){
stopifnot(all(rownames(res_geno_reduced_df) == rownames(res_g1_reduced)))
res_geno_reduced_df = cbind(res_geno_reduced_df, res_g1_reduced)
}
}
get_index <- function(x){
which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
}
w_ce = get_index(res_geno[["CE"]])
w_ko = get_index(res_geno[["KO"]])
w_ptc = get_index(res_geno[["PTC"]])
print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
print(paste0("number of DE genes from knock out strategy CE: ",
as.character(length(w_ce))))
print(paste0("number of DE genes from knock out strategy KO: ",
as.character(length(w_ko))))
print(paste0("number of DE genes from knock out strategy PTC: ",
as.character(length(w_ptc))))
ce_ko_overlap = length(intersect(rownames(res_geno[["CE"]])[w_ce],
rownames(res_geno[["KO"]])[w_ko]))
ko_ptc_overlap = length(intersect(rownames(res_geno[["KO"]])[w_ko],
rownames(res_geno[["PTC"]])[w_ptc]))
ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc],
rownames(res_geno[["CE"]])[w_ce]))
print(paste0("number of common DE genes by knock out strategies CE and KO: ",
as.character(ce_ko_overlap)))
print(paste0("number of common DE genes by knock out strategies KO and PTC: ",
as.character(ko_ptc_overlap)))
print(paste0("number of common DE genes by knock out strategies PTC and CE: ",
as.character(ptc_ce_overlap)))
if (max(length(w_ce), length(w_ko), length(w_ptc)) > 0){
m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce],
"KO" = rownames(res_geno[["KO"]])[w_ko],
"PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
g_up = UpSet(m)
print(g_up)
}
df1 = data.frame(padj_CE = res_geno[["CE"]]$padj,
padj_KO = res_geno[["KO"]]$padj,
padj_PTC = res_geno[["PTC"]]$padj)
cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_CE))) +
geom_pointdensity() +
ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) +
scale_color_viridis()
g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_KO))) +
geom_pointdensity() +
ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr2)) +
scale_color_viridis()
print(g4)
print(g5)
dim(res_geno_df)
res_geno_df[1:2,]
dim(res_geno_reduced_df)
res_geno_reduced_df[1:2,]
res_df = data.frame(gene_ID=rownames(res_geno_df),
res_geno_df)
dim(res_df)
res_df[1:2,]
res_reduced_gene_anno = gene_anno[match(rownames(res_geno_reduced_df),
gene_anno$ensembl_gene_id), ]
stopifnot(all(rownames(res_geno_reduced_df)==res_reduced_gene_anno$ensembl_gene_id))
# set gene hgnc_symbol that equal "" to NA
hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
res_reduced_df = data.frame(gene_ID=rownames(res_geno_reduced_df),
hgnc_symbol=hgnc_symbol_vec,
chr=res_reduced_gene_anno$chr,
strand=res_reduced_gene_anno$strand,
start=res_reduced_gene_anno$start,
end=res_reduced_gene_anno$end,
res_geno_reduced_df)
print("hgnc_symbol empty string and NA tables:")
print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
print(table(is.na(res_reduced_df$hgnc_symbol)))
dim(res_reduced_df)
res_reduced_df[1:2,]
fwrite(res_df,
sprintf("%s/%s_%s_%s_DEseq2_raw.tsv",
raw_output_dir, release_name, d_group, geno),
sep="\t")
fwrite(res_reduced_df,
sprintf("%s/%s_%s_%s_DEseq2.tsv",
processed_output_dir, release_name, d_group, geno),
sep="\t")
}
}
}## [1] "CBO"
##
## FEZF2_CE FEZF2_KO FEZF2_PTC LHX5_CE LHX5_KO LHX5_PTC LHX9_CE LHX9_KO
## 12 12 12 12 12 12 3 3
## LHX9_PTC OTX1_CE OTX1_KO OTX1_PTC PAX6_CE PAX6_KO PAX6_PTC RFX4_CE
## 3 9 9 9 3 3 3 12
## RFX4_KO RFX4_PTC WT_WT
## 12 12 21
## [1] 19
##
## FEZF2_CE FEZF2_KO FEZF2_PTC LHX5_CE LHX5_KO
## 20230228_23-scbct-008 0 0 0 0 0
## 20230424_23-scbct-028-run3 0 0 0 0 0
## 20230508_23-scbct-039-run2 9 9 9 0 0
## 20230522_23-scbct-052 0 0 0 9 9
## 20230824_23-scbct-092 0 0 0 0 0
## 20240131_23-robson-020 0 0 0 0 0
## 20240307_24-robson-003 0 0 0 0 0
## 20240307_24-robson-004 0 0 0 3 3
## 20240307_24-robson-006 3 3 3 0 0
##
## LHX5_PTC LHX9_CE LHX9_KO LHX9_PTC OTX1_CE OTX1_KO
## 20230228_23-scbct-008 0 0 0 0 0 0
## 20230424_23-scbct-028-run3 0 0 0 0 0 0
## 20230508_23-scbct-039-run2 0 0 0 0 0 0
## 20230522_23-scbct-052 9 0 0 0 0 0
## 20230824_23-scbct-092 0 3 3 3 0 0
## 20240131_23-robson-020 0 0 0 0 9 9
## 20240307_24-robson-003 0 0 0 0 0 0
## 20240307_24-robson-004 3 0 0 0 0 0
## 20240307_24-robson-006 0 0 0 0 0 0
##
## OTX1_PTC PAX6_CE PAX6_KO PAX6_PTC RFX4_CE RFX4_KO
## 20230228_23-scbct-008 0 3 3 3 0 0
## 20230424_23-scbct-028-run3 0 0 0 0 9 9
## 20230508_23-scbct-039-run2 0 0 0 0 0 0
## 20230522_23-scbct-052 0 0 0 0 0 0
## 20230824_23-scbct-092 0 0 0 0 0 0
## 20240131_23-robson-020 9 0 0 0 0 0
## 20240307_24-robson-003 0 0 0 0 3 3
## 20240307_24-robson-004 0 0 0 0 0 0
## 20240307_24-robson-006 0 0 0 0 0 0
##
## RFX4_PTC WT_WT
## 20230228_23-scbct-008 0 1
## 20230424_23-scbct-028-run3 9 3
## 20230508_23-scbct-039-run2 0 3
## 20230522_23-scbct-052 0 3
## 20230824_23-scbct-092 0 1
## 20240131_23-robson-020 0 3
## 20240307_24-robson-003 3 1
## 20240307_24-robson-004 0 3
## 20240307_24-robson-006 0 3
##
## 3 4 5 6 7 8 9 14
## FEZF2_CE 0 0 3 3 3 3 0 0
## FEZF2_KO 0 0 3 3 3 3 0 0
## FEZF2_PTC 0 0 3 3 3 3 0 0
## LHX5_CE 3 3 0 6 0 0 0 0
## LHX5_KO 3 3 0 6 0 0 0 0
## LHX5_PTC 3 3 0 6 0 0 0 0
## LHX9_CE 0 0 0 0 0 0 0 3
## LHX9_KO 0 0 0 0 0 0 0 3
## LHX9_PTC 0 0 0 0 0 0 0 3
## OTX1_CE 0 0 0 0 3 3 3 0
## OTX1_KO 0 0 0 0 3 3 3 0
## OTX1_PTC 0 0 0 0 3 3 3 0
## PAX6_CE 0 0 0 0 3 0 0 0
## PAX6_KO 0 0 0 0 3 0 0 0
## PAX6_PTC 0 0 0 0 3 0 0 0
## RFX4_CE 0 0 0 3 3 3 3 0
## RFX4_KO 0 0 0 3 3 3 3 0
## RFX4_PTC 0 0 0 3 3 3 3 0
## WT_WT 1 1 1 6 4 5 2 1
##
## 3 4 5 6 7 8 9 14
## FEZF2 0 0 9 9 9 9 0 0
## LHX5 9 9 0 18 0 0 0 0
## LHX9 0 0 0 0 0 0 0 9
## OTX1 0 0 0 0 9 9 9 0
## PAX6 0 0 0 0 9 0 0 0
## RFX4 0 0 0 9 9 9 9 0
## WT 1 1 1 6 4 5 2 1
##
## 3 4 5 6 7 8 9 14
## FEZF2_CE_robson-006 0 0 0 0 0 3 0 0
## FEZF2_CE_scbct-039-run2 0 0 3 3 3 0 0 0
## FEZF2_KO_robson-006 0 0 0 0 0 3 0 0
## FEZF2_KO_scbct-039-run2 0 0 3 3 3 0 0 0
## FEZF2_PTC_robson-006 0 0 0 0 0 3 0 0
## FEZF2_PTC_scbct-039-run2 0 0 3 3 3 0 0 0
## LHX5_CE_robson-004 0 0 0 3 0 0 0 0
## LHX5_CE_scbct-052 3 3 0 3 0 0 0 0
## LHX5_KO_robson-004 0 0 0 3 0 0 0 0
## LHX5_KO_scbct-052 3 3 0 3 0 0 0 0
## LHX5_PTC_robson-004 0 0 0 3 0 0 0 0
## LHX5_PTC_scbct-052 3 3 0 3 0 0 0 0
## LHX9_CE_scbct-092 0 0 0 0 0 0 0 3
## LHX9_KO_scbct-092 0 0 0 0 0 0 0 3
## LHX9_PTC_scbct-092 0 0 0 0 0 0 0 3
## OTX1_CE_robson-020 0 0 0 0 3 3 3 0
## OTX1_KO_robson-020 0 0 0 0 3 3 3 0
## OTX1_PTC_robson-020 0 0 0 0 3 3 3 0
## PAX6_CE_scbct-008 0 0 0 0 3 0 0 0
## PAX6_KO_scbct-008 0 0 0 0 3 0 0 0
## PAX6_PTC_scbct-008 0 0 0 0 3 0 0 0
## RFX4_CE_robson-003 0 0 0 0 0 0 3 0
## RFX4_CE_scbct-028-run3 0 0 0 3 3 3 0 0
## RFX4_KO_robson-003 0 0 0 0 0 0 3 0
## RFX4_KO_scbct-028-run3 0 0 0 3 3 3 0 0
## RFX4_PTC_robson-003 0 0 0 0 0 0 3 0
## RFX4_PTC_scbct-028-run3 0 0 0 3 3 3 0 0
## WT_WT_robson-003 0 0 0 0 0 0 1 0
## WT_WT_robson-004 0 0 0 3 0 0 0 0
## WT_WT_robson-006 0 0 0 0 0 3 0 0
## WT_WT_robson-020 0 0 0 0 1 1 1 0
## WT_WT_scbct-008 0 0 0 0 1 0 0 0
## WT_WT_scbct-028-run3 0 0 0 1 1 1 0 0
## WT_WT_scbct-039-run2 0 0 1 1 1 0 0 0
## WT_WT_scbct-052 1 1 0 1 0 0 0 0
## WT_WT_scbct-092 0 0 0 0 0 0 0 1
## [1] "---------------------------"
## [1] "On day 3: "
##
## scbct-052
## LHX5_CE 3
## LHX5_KO 3
## LHX5_PTC 3
## WT_WT 1
## [1] "---------------------------"
## [1] "On day 4: "
##
## scbct-052
## LHX5_CE 3
## LHX5_KO 3
## LHX5_PTC 3
## WT_WT 1
## [1] "---------------------------"
## [1] "On day 5: "
##
## scbct-039-run2
## FEZF2_CE 3
## FEZF2_KO 3
## FEZF2_PTC 3
## WT_WT 1
## [1] "---------------------------"
## [1] "On day 6: "
##
## robson-004 scbct-028-run3 scbct-039-run2 scbct-052
## FEZF2_CE 0 0 3 0
## FEZF2_KO 0 0 3 0
## FEZF2_PTC 0 0 3 0
## LHX5_CE 3 0 0 3
## LHX5_KO 3 0 0 3
## LHX5_PTC 3 0 0 3
## RFX4_CE 0 3 0 0
## RFX4_KO 0 3 0 0
## RFX4_PTC 0 3 0 0
## WT_WT 3 1 1 1
## [1] "---------------------------"
## [1] "On day 7: "
##
## robson-020 scbct-008 scbct-028-run3 scbct-039-run2
## FEZF2_CE 0 0 0 3
## FEZF2_KO 0 0 0 3
## FEZF2_PTC 0 0 0 3
## OTX1_CE 3 0 0 0
## OTX1_KO 3 0 0 0
## OTX1_PTC 3 0 0 0
## PAX6_CE 0 3 0 0
## PAX6_KO 0 3 0 0
## PAX6_PTC 0 3 0 0
## RFX4_CE 0 0 3 0
## RFX4_KO 0 0 3 0
## RFX4_PTC 0 0 3 0
## WT_WT 1 1 1 1
## [1] "---------------------------"
## [1] "On day 8: "
##
## robson-006 robson-020 scbct-028-run3
## FEZF2_CE 3 0 0
## FEZF2_KO 3 0 0
## FEZF2_PTC 3 0 0
## OTX1_CE 0 3 0
## OTX1_KO 0 3 0
## OTX1_PTC 0 3 0
## RFX4_CE 0 0 3
## RFX4_KO 0 0 3
## RFX4_PTC 0 0 3
## WT_WT 3 1 1
## [1] "---------------------------"
## [1] "On day 9: "
##
## robson-003 robson-020
## OTX1_CE 0 3
## OTX1_KO 0 3
## OTX1_PTC 0 3
## RFX4_CE 3 0
## RFX4_KO 3 0
## RFX4_PTC 3 0
## WT_WT 1 1
## [1] "---------------------------"
## [1] "On day 14: "
##
## scbct-092
## LHX9_CE 3
## LHX9_KO 3
## LHX9_PTC 3
## WT_WT 1
## [1] "==================================="
## [1] "day group CBO_day_3_4 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 23242"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 23.19633 secs
## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "DE group CBO_day_3_4 under KO gene LHX5:"
## [1] "number of DE genes from knock out strategy CE: 10"
## [1] "number of DE genes from knock out strategy KO: 7"
## [1] "number of DE genes from knock out strategy PTC: 11"
## [1] "number of common DE genes by knock out strategies CE and KO: 5"
## [1] "number of common DE genes by knock out strategies KO and PTC: 7"
## [1] "number of common DE genes by knock out strategies PTC and CE: 6"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18745 4497
##
## FALSE TRUE
## 18745 4497
## [1] "==================================="
## [1] "day group CBO_day_6 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 23818"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 43.88502 secs
## [1] "prop of non-null for rd: 2.13e-01"
## [1] "DE group CBO_day_6 under KO gene LHX5:"
## [1] "number of DE genes from knock out strategy CE: 4"
## [1] "number of DE genes from knock out strategy KO: 0"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18948 4870
##
## FALSE TRUE
## 18948 4870
## [1] "DE group CBO_day_6 under KO gene FEZF2:"
## [1] "number of DE genes from knock out strategy CE: 3"
## [1] "number of DE genes from knock out strategy KO: 1"
## [1] "number of DE genes from knock out strategy PTC: 1"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18948 4870
##
## FALSE TRUE
## 18948 4870
## [1] "DE group CBO_day_6 under KO gene RFX4:"
## [1] "number of DE genes from knock out strategy CE: 1"
## [1] "number of DE genes from knock out strategy KO: 6"
## [1] "number of DE genes from knock out strategy PTC: 2"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18948 4870
##
## FALSE TRUE
## 18948 4870
## [1] "==================================="
## [1] "day group CBO_day_7 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 23665"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 1.53999 mins
## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "DE group CBO_day_7 under KO gene FEZF2:"
## [1] "number of DE genes from knock out strategy CE: 1"
## [1] "number of DE genes from knock out strategy KO: 3"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18814 4851
##
## FALSE TRUE
## 18814 4851
## [1] "DE group CBO_day_7 under KO gene PAX6:"
## [1] "number of DE genes from knock out strategy CE: 0"
## [1] "number of DE genes from knock out strategy KO: 0"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18814 4851
##
## FALSE TRUE
## 18814 4851
## [1] "DE group CBO_day_7 under KO gene RFX4:"
## [1] "number of DE genes from knock out strategy CE: 0"
## [1] "number of DE genes from knock out strategy KO: 0"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18814 4851
##
## FALSE TRUE
## 18814 4851
## [1] "DE group CBO_day_7 under KO gene OTX1:"
## [1] "number of DE genes from knock out strategy CE: 2"
## [1] "number of DE genes from knock out strategy KO: 1"
## [1] "number of DE genes from knock out strategy PTC: 4"
## [1] "number of common DE genes by knock out strategies CE and KO: 1"
## [1] "number of common DE genes by knock out strategies KO and PTC: 1"
## [1] "number of common DE genes by knock out strategies PTC and CE: 2"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18814 4851
##
## FALSE TRUE
## 18814 4851
## [1] "==================================="
## [1] "day group CBO_day_8 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 23000"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 27.04115 secs
## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "DE group CBO_day_8 under KO gene OTX1:"
## [1] "number of DE genes from knock out strategy CE: 84"
## [1] "number of DE genes from knock out strategy KO: 104"
## [1] "number of DE genes from knock out strategy PTC: 6"
## [1] "number of common DE genes by knock out strategies CE and KO: 12"
## [1] "number of common DE genes by knock out strategies KO and PTC: 3"
## [1] "number of common DE genes by knock out strategies PTC and CE: 5"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18362 4638
##
## FALSE TRUE
## 18362 4638
## [1] "DE group CBO_day_8 under KO gene FEZF2:"
## [1] "number of DE genes from knock out strategy CE: 0"
## [1] "number of DE genes from knock out strategy KO: 5"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18362 4638
##
## FALSE TRUE
## 18362 4638
## [1] "==================================="
## [1] "day group CBO_day_9 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 22923"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 22.62518 secs
## [1] "prop of non-null for rd: 2.39e-01"
## [1] "DE group CBO_day_9 under KO gene RFX4:"
## [1] "number of DE genes from knock out strategy CE: 37"
## [1] "number of DE genes from knock out strategy KO: 56"
## [1] "number of DE genes from knock out strategy PTC: 33"
## [1] "number of common DE genes by knock out strategies CE and KO: 19"
## [1] "number of common DE genes by knock out strategies KO and PTC: 22"
## [1] "number of common DE genes by knock out strategies PTC and CE: 14"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18321 4602
##
## FALSE TRUE
## 18321 4602
## [1] "DE group CBO_day_9 under KO gene OTX1:"
## [1] "number of DE genes from knock out strategy CE: 37"
## [1] "number of DE genes from knock out strategy KO: 40"
## [1] "number of DE genes from knock out strategy PTC: 44"
## [1] "number of common DE genes by knock out strategies CE and KO: 15"
## [1] "number of common DE genes by knock out strategies KO and PTC: 17"
## [1] "number of common DE genes by knock out strategies PTC and CE: 22"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18321 4602
##
## FALSE TRUE
## 18321 4602
Save the size dataframe out
colnames(df_size) = c("DE_group", "knockout_gene_strategy", "n_ko", "n_WT")
df_size = as.data.frame(df_size)
multi_rows = which(table(df_size$knockout_gene_strategy)>1)
multi_rows## FEZF2_CE FEZF2_KO FEZF2_PTC LHX5_CE LHX5_KO LHX5_PTC OTX1_CE OTX1_KO
## 1 2 3 4 5 6 7 8
## OTX1_PTC RFX4_CE RFX4_KO RFX4_PTC
## 9 13 14 15
df_size[which(df_size$knockout_gene_strategy%in%names(multi_rows)),]## DE_group knockout_gene_strategy n_ko
## 1 CBO_day_3_4 LHX5_CE 6
## 2 CBO_day_3_4 LHX5_KO 6
## 3 CBO_day_3_4 LHX5_PTC 6
## 4 CBO_day_6 LHX5_CE 6
## 5 CBO_day_6 LHX5_KO 6
## 6 CBO_day_6 LHX5_PTC 6
## 7 CBO_day_6 FEZF2_CE 3
## 8 CBO_day_6 FEZF2_KO 3
## 9 CBO_day_6 FEZF2_PTC 3
## 10 CBO_day_6 RFX4_CE 3
## 11 CBO_day_6 RFX4_KO 3
## 12 CBO_day_6 RFX4_PTC 3
## 13 CBO_day_7 FEZF2_CE 3
## 14 CBO_day_7 FEZF2_KO 3
## 15 CBO_day_7 FEZF2_PTC 3
## 19 CBO_day_7 RFX4_CE 3
## 20 CBO_day_7 RFX4_KO 3
## 21 CBO_day_7 RFX4_PTC 3
## 22 CBO_day_7 OTX1_CE 3
## 23 CBO_day_7 OTX1_KO 3
## 24 CBO_day_7 OTX1_PTC 3
## 25 CBO_day_8_excluding_20230424_23-scbct-028-run3 OTX1_CE 3
## 26 CBO_day_8_excluding_20230424_23-scbct-028-run3 OTX1_KO 3
## 27 CBO_day_8_excluding_20230424_23-scbct-028-run3 OTX1_PTC 3
## 28 CBO_day_8_excluding_20230424_23-scbct-028-run3 FEZF2_CE 3
## 29 CBO_day_8_excluding_20230424_23-scbct-028-run3 FEZF2_KO 3
## 30 CBO_day_8_excluding_20230424_23-scbct-028-run3 FEZF2_PTC 3
## 31 CBO_day_9 RFX4_CE 3
## 32 CBO_day_9 RFX4_KO 3
## 33 CBO_day_9 RFX4_PTC 3
## 34 CBO_day_9 OTX1_CE 3
## 35 CBO_day_9 OTX1_KO 3
## 36 CBO_day_9 OTX1_PTC 3
## n_WT
## 1 2
## 2 2
## 3 2
## 4 6
## 5 6
## 6 6
## 7 6
## 8 6
## 9 6
## 10 6
## 11 6
## 12 6
## 13 4
## 14 4
## 15 4
## 19 4
## 20 4
## 21 4
## 22 4
## 23 4
## 24 4
## 25 4
## 26 4
## 27 4
## 28 4
## 29 4
## 30 4
## 31 2
## 32 2
## 33 2
## 34 2
## 35 2
## 36 2
fwrite(df_size,
sprintf("%s/%s_DE_n_samples.tsv", output_dir, release_name),
sep="\t")gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 8140278 434.8 12621857 674.1 NA 12621857 674.1
## Vcells 40220222 306.9 107200090 817.9 65536 107197600 817.9
sessionInfo()## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] readxl_1.4.3 UpSetR_1.4.0
## [3] ComplexHeatmap_2.14.0 data.table_1.15.4
## [5] dplyr_1.1.4 ggpointdensity_0.1.0
## [7] viridis_0.6.5 viridisLite_0.4.2
## [9] DESeq2_1.38.3 SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.3.0 GenomicRanges_1.50.2
## [15] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [17] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [19] RColorBrewer_1.1-3 MASS_7.3-60
## [21] stringr_1.5.1 ggpubr_0.6.0
## [23] ggrepel_0.9.5 ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-8 bit64_4.0.5 doParallel_1.0.17
## [4] httr_1.4.7 tools_4.2.3 backports_1.5.0
## [7] bslib_0.8.0 utf8_1.2.4 R6_2.5.1
## [10] DBI_1.2.3 colorspace_2.1-1 GetoptLong_1.0.5
## [13] withr_3.0.1 tidyselect_1.2.1 gridExtra_2.3
## [16] bit_4.0.5 compiler_4.2.3 cli_3.6.3
## [19] Cairo_1.6-2 DelayedArray_0.24.0 labeling_0.4.3
## [22] sass_0.4.9 scales_1.3.0 digest_0.6.37
## [25] rmarkdown_2.28 XVector_0.38.0 pkgconfig_2.0.3
## [28] htmltools_0.5.8.1 highr_0.11 fastmap_1.2.0
## [31] GlobalOptions_0.1.2 rlang_1.1.4 rstudioapi_0.16.0
## [34] RSQLite_2.3.7 farver_2.1.2 shape_1.4.6.1
## [37] jquerylib_0.1.4 generics_0.1.3 jsonlite_1.8.8
## [40] BiocParallel_1.32.6 car_3.1-2 RCurl_1.98-1.16
## [43] magrittr_2.0.3 GenomeInfoDbData_1.2.9 Matrix_1.5-4.1
## [46] Rcpp_1.0.13 munsell_0.5.1 fansi_1.0.6
## [49] abind_1.4-5 lifecycle_1.0.4 stringi_1.8.4
## [52] yaml_2.3.10 carData_3.0-5 zlibbioc_1.44.0
## [55] plyr_1.8.9 blob_1.2.4 parallel_4.2.3
## [58] crayon_1.5.3 lattice_0.22-6 Biostrings_2.66.0
## [61] annotate_1.76.0 circlize_0.4.16 KEGGREST_1.38.0
## [64] locfit_1.5-9.10 knitr_1.48 pillar_1.9.0
## [67] rjson_0.2.21 ggsignif_0.6.4 geneplotter_1.76.0
## [70] codetools_0.2-20 XML_3.99-0.17 glue_1.7.0
## [73] evaluate_0.24.0 foreach_1.5.2 vctrs_0.6.5
## [76] png_0.1-8 cellranger_1.1.0 gtable_0.3.5
## [79] purrr_1.0.2 tidyr_1.3.1 clue_0.3-65
## [82] cachem_1.1.0 xfun_0.47 xtable_1.8-4
## [85] broom_1.0.6 rstatix_0.7.2 tibble_3.2.1
## [88] iterators_1.0.14 AnnotationDbi_1.60.2 memoise_2.0.1
## [91] cluster_2.1.6